{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "from tqdm import tqdm\n",
    "# Load the raw metadata and review dumps (one JSON object per line).\n",
    "with open('meta_Video_Games.json', encoding='utf-8') as f:\n",
    "    metadata = [json.loads(line) for line in f]\n",
    "with open('Video_Games_5.json', encoding='utf-8') as f:\n",
    "    reviews = [json.loads(line) for line in f]\n",
    "\n",
    "# Collect the distinct reviewers and items that occur in the reviews.\n",
    "users = set()\n",
    "items = set()\n",
    "for review in tqdm(reviews):\n",
    "    users.add(review['reviewerID'])\n",
    "    items.add(review['asin'])\n",
    "\n",
    "# Number the items in sorted order. Iterating the set directly follows\n",
    "# per-process string hashing, so the ids would differ from run to run.\n",
    "item2id = {asin: idx for idx, asin in enumerate(sorted(items))}\n",
    "\n",
    "# Print #users, #items, #reviews and reviews / (users * items);\n",
    "# the ratio is reported as 0.0 when there are no users or items.\n",
    "n_cells = len(users) * len(items)\n",
    "density = len(reviews) / n_cells if n_cells else 0.0\n",
    "print(len(users), len(items), len(reviews), density)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# asin -> title for every metadata entry whose title is longer than one character.\n",
    "id_title = {}\n",
    "for meta in tqdm(metadata):\n",
    "    title = meta.get('title')  # tolerate entries that carry no 'title' key\n",
    "    if title and len(title) > 1: # remove the item without title\n",
    "        id_title[meta['asin']] = title\n",
    "\n",
    "# asin -> running index, assigned the first time a titled item is seen.\n",
    "id_item = {}\n",
    "cnt = 0\n",
    "\n",
    "# reviewerID -> parallel lists describing the reviews kept for that user.\n",
    "users = dict()\n",
    "for review in tqdm(reviews):\n",
    "    if 'asin' not in review:\n",
    "        # Skip only this record; a `break` here would discard every later review.\n",
    "        continue\n",
    "    item = review['asin']\n",
    "    if item not in id_title:\n",
    "        continue\n",
    "    if item not in id_item:\n",
    "        id_item[item] = cnt\n",
    "        cnt += 1\n",
    "    # A review is only usable when it has both a rating and a timestamp.\n",
    "    if 'overall' not in review or 'unixReviewTime' not in review:\n",
    "        continue\n",
    "    user = review['reviewerID']\n",
    "    if user not in users:\n",
    "        users[user] = {\n",
    "            'items': [],\n",
    "            'ratings': [],\n",
    "            'timestamps': [],\n",
    "            'reviews': []  # created here but never appended to in this cell\n",
    "        }\n",
    "    users[user]['items'].append(item)\n",
    "    users[user]['ratings'].append(review['overall'])\n",
    "    users[user]['timestamps'].append(review['unixReviewTime'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Each target item is paired with at most this many preceding items.\n",
    "MAX_HISTORY = 10\n",
    "\n",
    "interactions = []\n",
    "for key in tqdm(users.keys()):\n",
    "    profile = users[key]\n",
    "    # Order this user's events by timestamp (sorted() keeps ties in input order).\n",
    "    # Local names avoid rebinding the builtin `all` or the notebook-level `items`.\n",
    "    events = sorted(\n",
    "        zip(profile['items'], profile['ratings'], profile['timestamps']),\n",
    "        key=lambda event: int(event[-1]),\n",
    "    )\n",
    "    asins, scores, times = (list(column) for column in zip(*events))\n",
    "    profile['items'] = asins\n",
    "    profile['item_ids'] = [item2id[x] for x in asins]\n",
    "    profile['item_titles'] = [id_title[x] for x in asins]\n",
    "    profile['ratings'] = scores\n",
    "    profile['timestamps'] = times\n",
    "    # Users with more than MAX_HISTORY events yield one row per event from index\n",
    "    # MAX_HISTORY onwards; shorter users yield a single row for their last event\n",
    "    # (its history is empty when the user has only one event).\n",
    "    for i in range(min(MAX_HISTORY, len(asins) - 1), len(asins)):\n",
    "        st = max(i - MAX_HISTORY, 0)\n",
    "        interactions.append([\n",
    "            key,\n",
    "            profile['items'][st:i], profile['items'][i],\n",
    "            profile['item_ids'][st:i], profile['item_ids'][i],\n",
    "            profile['item_titles'][st:i], profile['item_titles'][i],\n",
    "            scores[st:i], scores[i],\n",
    "            int(times[i]),  # last field: used as the sort key when splitting\n",
    "        ])\n",
    "print(len(interactions))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "\n",
    "# Chronological split on the target timestamp (last field of each row):\n",
    "# oldest 80% -> train, next 10% -> valid, newest 10% -> test.\n",
    "interactions = sorted(interactions, key=lambda x: x[-1])\n",
    "header = ['user_id', 'item_asins', 'item_asin', 'history_item_id', 'item_id', 'history_item_title', 'item_title', 'history_rating', 'rating', 'timestamp']\n",
    "train_end = int(len(interactions) * 0.8)\n",
    "valid_end = int(len(interactions) * 0.9)\n",
    "splits = [\n",
    "    ('./train.csv', interactions[:train_end]),\n",
    "    ('./valid.csv', interactions[train_end:valid_end]),\n",
    "    ('./test.csv', interactions[valid_end:]),\n",
    "]\n",
    "for path, rows in splits:\n",
    "    # newline='' lets the csv module control line endings itself (no blank rows\n",
    "    # on Windows); utf-8 keeps non-ASCII titles writable on every platform.\n",
    "    with open(path, 'w', newline='', encoding='utf-8') as f:\n",
    "        csvwriter = csv.writer(f)\n",
    "        csvwriter.writerow(header)\n",
    "        csvwriter.writerows(rows)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import ast\n",
    "import json\n",
    "import random\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "def csv_to_json(input_path, output_path, sample=False, sample_size=5000):\n",
    "    \"\"\"Convert an interaction CSV into a JSON file of instruction-tuning records.\n",
    "\n",
    "    Args:\n",
    "        input_path: CSV to read. It must provide `history_item_title` (the\n",
    "            string form of a Python list of titles) and `item_title`.\n",
    "        output_path: JSON file to write.\n",
    "        sample: If true, keep a random subset of rows (random_state=42) and\n",
    "            also save that subset as CSV, at `output_path` with its last\n",
    "            five characters replaced by `.csv`.\n",
    "        sample_size: Upper bound on the subset size. Inputs with fewer rows\n",
    "            are kept whole, in shuffled order.\n",
    "    \"\"\"\n",
    "    data = pd.read_csv(input_path)\n",
    "    if sample:\n",
    "        # DataFrame.sample raises when asked for more rows than exist, so cap n.\n",
    "        n_rows = min(sample_size, len(data))\n",
    "        data = data.sample(n=n_rows, random_state=42).reset_index(drop=True)\n",
    "        # Assumes output_path ends in a five-character suffix such as '.json'.\n",
    "        data.to_csv(output_path[:-5] + '.csv', index=False)\n",
    "    json_list = []\n",
    "    columns = zip(data['history_item_title'], data['item_title'])\n",
    "    for raw_history, raw_target in tqdm(columns, total=len(data)):\n",
    "        # literal_eval parses the stored list without executing code, unlike eval.\n",
    "        titles = ast.literal_eval(raw_history)\n",
    "        quoted_titles = ', '.join('\"' + title + '\"' for title in titles)\n",
    "        history = 'The user has played the following video games before:' + quoted_titles\n",
    "        json_list.append({\n",
    "            'instruction': 'Given a list of video games the user has played before, please recommend a new video game that the user likes to the user.',\n",
    "            'input': history + '\\n ',\n",
    "            'output': '\"' + str(raw_target) + '\"',\n",
    "        })\n",
    "    with open(output_path, 'w') as f:\n",
    "        json.dump(json_list, f, indent=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert every split in full: train, then valid, then test.\n",
    "for split in ('train', 'valid', 'test'):\n",
    "    csv_to_json(f'./{split}.csv', f'./{split}.json')\n",
    "# Then write randomly sampled subsets of the valid and test splits.\n",
    "for split in ('valid', 'test'):\n",
    "    csv_to_json(f'./{split}.csv', f'./{split}_5000.json', sample=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "alpaca_lora",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.10"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
